Show code
# Global parameters for the data-exploration workbook.
# Categorical variables with at least this many levels are excluded
# from the basic exploration plots.
dataexp_level_exclusion_threshold <- 100
# Maximum number of factor levels shown before lumping the rest.
dataexp_cat_level_count <- 40
# Number of bins used by every histogram in this workbook.
# NOTE(review): rendering artifact — the narrative text "This workbook was
# created..." is fused onto the end of this code line in the extracted source.
dataexp_hist_bins_count <- 50This workbook was created using the ‘dataexpks’ template:
https://github.com/DublinLearningGroup/dataexpks
This workbook performs the basic data exploration of the dataset.
# Repeated declaration of the exploration parameters (the 'dataexpks'
# template re-emits them when the workbook is rendered).
dataexp_level_exclusion_threshold <- 100
dataexp_cat_level_count <- 40
# NOTE(review): the prose "First we load the dataset." is fused onto this line.
dataexp_hist_bins_count <- 50First we load the dataset.
# Load the raw life-insurance policy book (in- and out-of-force policies)
# from Parquet; read_parquet() presumably comes from {arrow} — the library()
# calls are outside this view.
rawdata_tbl <- read_parquet("data/lifeins_policybook_inoutforce.parquet")
# NOTE(review): the glimpse() output is fused onto this and following lines.
glimpse(rawdata_tbl)Rows: 1,500,000
Columns: 25
$ policy_id <chr> "C010000009", "C010000019", "C010000032", "C01…
$ countyname <chr> "Dublin City", "Kilkenny County", "South Dubli…
$ edname <chr> "North Dock B", "Kilkenny Rural", "Clondalkin-…
$ nuts3name <chr> "Dublin", "South-East (IE)", "Dublin", "West",…
$ sa_id <chr> "A268108011", "A097063020", "A267050012", "A06…
$ cluster_id <chr> "n6_c0", "n6_c0", "n6_c4", "n6_c1", "n6_c5", "…
$ prod_type <fct> protection, protection, pension, pension, prot…
$ prem_type <chr> "RP", "RP", "SP", "RP", "RP", "RP", "RP", "RP"…
$ prem_freq <chr> "12", "12", NA, "12", "12", "12", "12", "12", …
$ prem_ape <dbl> 4172.34, 1150.17, 600.00, 3552.98, 313.05, 731…
$ prem_risk <dbl> 2980.2396, 821.5525, NA, NA, 223.6095, 522.815…
$ policy_startdate <date> 1990-01-02, 1990-01-02, 1990-01-02, 1990-01-0…
$ policy_enddate <date> 2010-01-02, 2000-01-02, 2067-06-13, 2091-01-1…
$ policy_duration <int> 20, 10, NA, NA, 20, 15, 20, 15, 15, 20, 20, 5,…
$ mort_rating <dbl> 200, 100, NA, NA, 200, 100, NA, 200, 100, 100,…
$ sum_assured <dbl> 150000, 450000, NA, NA, 250000, 450000, NA, 50…
$ dob_life1 <date> 1937-07-22, 1964-08-19, 1947-06-13, 1971-01-1…
$ gender_life1 <chr> "F", "M", "F", "M", "F", "F", "M", "M", "M", "…
$ smoker_life1 <chr> "S", "N", "N", "N", "S", "N", "S", "S", "N", "…
$ isjointlife <lgl> TRUE, TRUE, NA, NA, FALSE, TRUE, NA, FALSE, FA…
$ islifeonly <lgl> TRUE, FALSE, NA, NA, TRUE, TRUE, NA, TRUE, TRU…
$ mortgage_status <chr> "MORTDECR", "MORTDECR", NA, NA, "MORTDECR", "T…
$ policy_status <chr> "lapsed", "lapsed", "lapsed", "lapsed", "lapse…
$ policy_statuschangedate <date> 1998-10-02, 1993-08-02, 1998-04-02, 1996-09-0…
$ lapsed <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE…
### _TEMPLATE_
### Do simple datatype transforms and save output in data_tbl

# Standardise all column names to snake_case.
# The original mixed %>% and |> in one expression and used the magrittr
# alias set_colnames() with the `.` placeholder; dplyr::rename_with()
# applies the same transformation (to_snake_case over every column name)
# idiomatically and with a single pipe style.
data_tbl <- rawdata_tbl |>
  rename_with(to_snake_case)
glimpse(data_tbl)Rows: 1,500,000
Columns: 25
$ policy_id <chr> "C010000009", "C010000019", "C010000032", "C01…
$ countyname <chr> "Dublin City", "Kilkenny County", "South Dubli…
$ edname <chr> "North Dock B", "Kilkenny Rural", "Clondalkin-…
$ nuts_3_name <chr> "Dublin", "South-East (IE)", "Dublin", "West",…
$ sa_id <chr> "A268108011", "A097063020", "A267050012", "A06…
$ cluster_id <chr> "n6_c0", "n6_c0", "n6_c4", "n6_c1", "n6_c5", "…
$ prod_type <fct> protection, protection, pension, pension, prot…
$ prem_type <chr> "RP", "RP", "SP", "RP", "RP", "RP", "RP", "RP"…
$ prem_freq <chr> "12", "12", NA, "12", "12", "12", "12", "12", …
$ prem_ape <dbl> 4172.34, 1150.17, 600.00, 3552.98, 313.05, 731…
$ prem_risk <dbl> 2980.2396, 821.5525, NA, NA, 223.6095, 522.815…
$ policy_startdate <date> 1990-01-02, 1990-01-02, 1990-01-02, 1990-01-0…
$ policy_enddate <date> 2010-01-02, 2000-01-02, 2067-06-13, 2091-01-1…
$ policy_duration <int> 20, 10, NA, NA, 20, 15, 20, 15, 15, 20, 20, 5,…
$ mort_rating <dbl> 200, 100, NA, NA, 200, 100, NA, 200, 100, 100,…
$ sum_assured <dbl> 150000, 450000, NA, NA, 250000, 450000, NA, 50…
$ dob_life_1 <date> 1937-07-22, 1964-08-19, 1947-06-13, 1971-01-1…
$ gender_life_1 <chr> "F", "M", "F", "M", "F", "F", "M", "M", "M", "…
$ smoker_life_1 <chr> "S", "N", "N", "N", "S", "N", "S", "S", "N", "…
$ isjointlife <lgl> TRUE, TRUE, NA, NA, FALSE, TRUE, NA, FALSE, FA…
$ islifeonly <lgl> TRUE, FALSE, NA, NA, TRUE, TRUE, NA, TRUE, TRU…
$ mortgage_status <chr> "MORTDECR", "MORTDECR", NA, NA, "MORTDECR", "T…
$ policy_status <chr> "lapsed", "lapsed", "lapsed", "lapsed", "lapse…
$ policy_statuschangedate <date> 1998-10-02, 1993-08-02, 1998-04-02, 1996-09-0…
$ lapsed <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, TRUE…
We now create derived features useful for modelling. These values are new variables calculated from existing variables in the data.
# Restrict the book to protection business and derive the week-based
# timing features used later for modelling.
data_tbl <- rawdata_tbl |>
  filter(prod_type == "protection") |>
  mutate(
    # Weeks from policy inception to the snapshot date (2016-01-01).
    weeks_to_now = as.numeric(
      difftime(as.Date("2016-01-01"), policy_startdate, units = "weeks")
    ),
    # Weeks from inception to the last recorded status change.
    weeks_to_status = as.numeric(
      difftime(policy_statuschangedate, policy_startdate, units = "weeks")
    ),
    # Observed lifetime in weeks: in-force policies are right-censored
    # at the snapshot date, otherwise use the status-change date.
    policy_lifetime = if_else(
      policy_status == "inforce", weeks_to_now, weeks_to_status
    )
  )
glimpse(data_tbl)Rows: 773,337
Columns: 28
$ policy_id <chr> "C010000009", "C010000019", "C010000091", "C01…
$ countyname <chr> "Dublin City", "Kilkenny County", "Cork County…
$ edname <chr> "North Dock B", "Kilkenny Rural", "Fermoy Rura…
$ nuts3name <chr> "Dublin", "South-East (IE)", "South-West (IE)"…
$ sa_id <chr> "A268108011", "A097063020", "A047151005", "A02…
$ cluster_id <chr> "n6_c0", "n6_c0", "n6_c5", "n6_c2", "n6_c0", "…
$ prod_type <fct> protection, protection, protection, protection…
$ prem_type <chr> "RP", "RP", "RP", "RP", "RP", "RP", "RP", "RP"…
$ prem_freq <chr> "12", "12", "12", "12", "12", "12", "12", "12"…
$ prem_ape <dbl> 4172.34, 1150.17, 313.05, 731.94, 2938.13, 799…
$ prem_risk <dbl> 2980.2396, 821.5525, 223.6095, 522.8157, 2098.…
$ policy_startdate <date> 1990-01-02, 1990-01-02, 1990-01-02, 1990-01-0…
$ policy_enddate <date> 2010-01-02, 2000-01-02, 2010-01-02, 2005-01-0…
$ policy_duration <int> 20, 10, 20, 15, 15, 15, 20, 5, 5, 15, 20, 20, …
$ mort_rating <dbl> 200, 100, 200, 100, 200, 100, 100, 200, 150, 1…
$ sum_assured <dbl> 150000, 450000, 250000, 450000, 500000, 200000…
$ dob_life1 <date> 1937-07-22, 1964-08-19, 1971-02-13, 1965-10-0…
$ gender_life1 <chr> "F", "M", "F", "F", "M", "M", "F", "M", "M", "…
$ smoker_life1 <chr> "S", "N", "S", "N", "S", "N", "N", "S", "Q", "…
$ isjointlife <lgl> TRUE, TRUE, FALSE, TRUE, FALSE, FALSE, FALSE, …
$ islifeonly <lgl> TRUE, FALSE, TRUE, TRUE, TRUE, TRUE, TRUE, TRU…
$ mortgage_status <chr> "MORTDECR", "MORTDECR", "MORTDECR", "TERM", "T…
$ policy_status <chr> "lapsed", "lapsed", "lapsed", "lapsed", "lapse…
$ policy_statuschangedate <date> 1998-10-02, 1993-08-02, 1991-10-02, 1990-08-0…
$ lapsed <lgl> TRUE, TRUE, TRUE, TRUE, TRUE, TRUE, FALSE, TRU…
$ weeks_to_now <dbl> 1356.429, 1356.429, 1356.429, 1356.429, 1356.4…
$ weeks_to_status <dbl> 456.428571, 186.857143, 91.142857, 30.285714, …
$ policy_lifetime <dbl> 456.428571, 186.857143, 91.142857, 30.285714, …
We now want to look at some very high level checks on the data, and we leverage some of the functionality provided by DataExplorer.
We first want to look at a visualisation of some high-level summaries of the meta-data on this dataset. This gives us a quick view of the categorical and numeric values in the dataset, as well as the proportions of missing values.
# High-level overview of the table via DataExplorer::plot_intro():
# proportions of discrete/continuous columns and overall missingness.
data_tbl |>
plot_intro(
title = "High Level Table Summary",
ggtheme = theme_cowplot()
)Before we do anything with the data, we first check for missing values in the dataset. In some cases, missing data is coded by a special character rather than as a blank, so we first correct for this.
### _TEMPLATE_
### ADD CODE TO CORRECT FOR DATA ENCODING HEREWith missing data properly encoded, we now visualise the missing data in a number of different ways.
# Per-variable missingness summary, banded into qualitative groups
# (Good / Acceptable / Bad / Remove) by proportion missing.
data_tbl |>
plot_missing(
title = "Summary of Data Missingness",
group = list(Good = 0.05, Acceptable = 0.2, Bad = 0.8, Remove = 1),
ggtheme = theme_cowplot()
)We now want to repeat this plot but only for those columns that have some missing values.
# Same plot restricted to variables that actually contain missing values.
data_tbl |>
plot_missing(
title = "Summary of Data Missingness (missing variables only)",
missing_only = TRUE,
group = list(Good = 0.05, Acceptable = 0.2, Bad = 0.8, Remove = 1),
ggtheme = theme_cowplot()
)It is useful to get an idea of what combinations of variables tend to have variables with missing values simultaneously, so to construct a visualisation for this we create a count of all the times given combinations of variables have missing values, producing a heat map for these combination counts.
# Number of distinct missingness patterns to show in the heat map below.
dataexp_missing_group_count <- 20
row_count <- rawdata_tbl |> nrow()
# Lambda: turn a column into a 0/1 integer missing-value indicator.
count_nas <- ~ .x |> are_na() |> vec_cast(integer())
missing_vizdata_tbl <- rawdata_tbl |>
# Recode every column as its missingness indicator.
mutate(across(everything(), count_nas)) %>%
# The magrittr pipe is required on the previous step: pmap_chr(., str_c)
# needs the `.` placeholder to paste all indicator columns of each row
# into a single string label identifying its missingness pattern.
mutate(label = pmap_chr(., str_c)) |>
group_by(label) |>
mutate(
miss_count = n(),
miss_prop = miss_count / row_count
) |>
# Keep one representative row per missingness pattern.
slice_max(order_by = miss_prop, n = 1, with_ties = FALSE) |>
ungroup() |>
# Long format: one row per (pattern, variable) pair for the tile plot.
pivot_longer(
!c(label, miss_count, miss_prop),
names_to = "variable_name",
values_to = "presence"
) |>
mutate(
prop_label = sprintf("%6.4f", miss_prop)
)
# Most frequent missingness patterns to display.
# Renamed from `top10_data_tbl`: the original name was misleading — the
# number of patterns kept is dataexp_missing_group_count (20), not 10.
top_missing_groups_tbl <- missing_vizdata_tbl |>
  select(label, miss_prop) |>
  distinct() |>
  slice_max(order_by = miss_prop, n = dataexp_missing_group_count)

# Restrict the long-format data to those top patterns for plotting.
missing_plot_tbl <- missing_vizdata_tbl |>
  semi_join(top_missing_groups_tbl, by = "label")
# Heat map: one row per missingness pattern (labelled by its proportion of
# rows), one column per variable; filled cells mark the missing variables.
ggplot(missing_plot_tbl) +
geom_tile(aes(x = variable_name, y = prop_label, fill = presence), height = 0.8) +
scale_fill_continuous() +
scale_x_discrete(position = "top", labels = ~ abbreviate(.x, minlength = 10)) +
xlab("Variable") +
ylab("Proportion of Rows") +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 90, vjust = 0.5)
)This visualisation takes a little explaining.
Each row represents a combination of variables with simultaneous missing values. For each row in the graphic, the coloured entries show which particular variables are missing in that combination. The proportion of rows with that combination is displayed in both the label for the row and the colouring for the cells in the row.
With the raw data loaded up we now remove obvious unique or near-unique variables that are not amenable to basic exploration and plotting.
# Classify the columns of data_tbl by type (continuous / discrete /
# datetime / logical); create_coltype_list() is a dataexpks helper.
coltype_lst <- create_coltype_list(data_tbl)

# Lambda: number of distinct values (levels) taken by a column.
count_levels <- ~ .x |> unique() |> length()

# Level counts for every discrete variable, highest first.
catvar_valuecount_tbl <- data_tbl |>
  summarise(
    .groups = "drop",
    # all_of() marks the external character vector of column names
    # explicitly; passing the bare vector to across() is deprecated in
    # current tidyselect and raises a warning.
    across(all_of(coltype_lst$split$discrete), count_levels)
  ) |>
  pivot_longer(
    cols = everything(),
    names_to = "var_name",
    values_to = "level_count"
  ) |>
  arrange(desc(level_count))
# Display the level counts of all categorical variables.
print(catvar_valuecount_tbl)# A tibble: 13 × 2
var_name level_count
<chr> <int>
1 policy_id 773337
2 sa_id 18481
3 edname 3109
4 countyname 34
5 nuts3name 8
6 cluster_id 6
7 prem_freq 3
8 smoker_life1 3
9 mortgage_status 3
10 policy_status 3
11 gender_life1 2
12 prod_type 1
13 prem_type 1
# Row count of the protection-only table (used below to detect unique
# identifier columns whose level count equals the row count).
row_count <- data_tbl |> nrow()
cat(glue("Dataset has {row_count} rows\n"))Dataset has 773337 rows
Now that we have a table of the counts of all the categorical variables we can automatically exclude unique variables from the exploration, as the level count will match the row count.
# Variables with one level per row are unique identifiers and carry no
# distributional information, so exclude them from exploration.
unique_vars <- catvar_valuecount_tbl |>
filter(level_count == row_count) |>
pull(var_name)
print(unique_vars)[1] "policy_id"
# NOTE(review): one_of() is superseded in tidyselect — any_of() is the
# modern equivalent (and does not warn when a name is absent).
explore_data_tbl <- data_tbl |>
select(-one_of(unique_vars))Having removed the unique identifier variables from the dataset, we may also wish to exclude categoricals with high level counts also, so we create a vector of those variable names.
# Non-unique categoricals whose level count meets the exclusion
# threshold; these make bar plots unreadable.
highcount_vars <- catvar_valuecount_tbl |>
filter(level_count >= dataexp_level_exclusion_threshold,
level_count < row_count) |>
pull(var_name)
cat(str_c(highcount_vars, collapse = ", "))sa_id, edname
We now can continue doing some basic exploration of the data. We may also choose to remove some extra columns from the dataset.
### You may want to comment out these next few lines to customise which
### categoricals are kept in the exploration.
drop_vars <- c(highcount_vars)
if (length(drop_vars) > 0) {
# Remove the high-cardinality categoricals and report what was dropped.
explore_data_tbl <- explore_data_tbl |>
select(-one_of(drop_vars))
cat(str_c(drop_vars, collapse = ", "))
}sa_id, edname
Now that we have loaded the data we can prepare it for some basic data exploration.
We use a number of summary visualisations provided by DataExplorer: a facet plot across each variable with categorical variables getting bar plots and numerical plots getting histograms.
We first look at the barplots of categorical variables.
# Facetted bar plots of all categorical variables (DataExplorer skips
# columns with more than 50 categories, as the fused output below shows).
plot_bar(
data_tbl,
ncol = 2,
nrow = 2,
title = "Barplots of Data",
ggtheme = theme_cowplot()
)7 columns ignored with more than 50 categories.
policy_id: 773337 categories
edname: 3109 categories
sa_id: 18481 categories
policy_startdate: 6710 categories
policy_enddate: 18877 categories
dob_life1: 25130 categories
policy_statuschangedate: 9350 categories
We then have a quick look at histograms of the numeric variables.
# Facetted histograms of all numeric variables.
plot_histogram(
data_tbl,
ncol = 2,
nrow = 2,
title = "Histograms of Data",
ggtheme = theme_cowplot()
)Finally, we split the remaining variables into different categories and then produce a sequence of plots for each variable.
# Recompute the column-type classification on the reduced exploration table.
coltype_lst <- create_coltype_list(explore_data_tbl)
print(coltype_lst)$split
$split$continuous
[1] "prem_ape" "prem_risk" "policy_duration" "mort_rating"
[5] "sum_assured" "weeks_to_now" "weeks_to_status" "policy_lifetime"
$split$datetime
[1] "policy_startdate" "policy_enddate"
[3] "dob_life1" "policy_statuschangedate"
$split$discrete
[1] "countyname" "nuts3name" "cluster_id" "prod_type"
[5] "prem_type" "prem_freq" "gender_life1" "smoker_life1"
[9] "mortgage_status" "policy_status"
$split$logical
[1] "isjointlife" "islifeonly" "lapsed"
$columns
countyname nuts3name cluster_id
"discrete" "discrete" "discrete"
prod_type prem_type prem_freq
"discrete" "discrete" "discrete"
prem_ape prem_risk policy_startdate
"continuous" "continuous" "datetime"
policy_enddate policy_duration mort_rating
"datetime" "continuous" "continuous"
sum_assured dob_life1 gender_life1
"continuous" "datetime" "discrete"
smoker_life1 isjointlife islifeonly
"discrete" "logical" "logical"
mortgage_status policy_status policy_statuschangedate
"discrete" "discrete" "datetime"
lapsed weeks_to_now weeks_to_status
"logical" "continuous" "continuous"
policy_lifetime
"continuous"
Logical variables only take two values: TRUE or FALSE. It is useful to see missing data as well though, so we also plot the count of those.
# Univariate bar plots for each logical variable, with the count of
# missing values reported in the title.
logical_vars <- coltype_lst$split$logical |> sort()
for (plot_varname in logical_vars) {
cat("--\n")
cat(glue("{plot_varname}\n"))
# Missing-value count for this variable (are_na() is rlang's is.na wrapper).
na_count <- explore_data_tbl |> pull(.data[[plot_varname]]) |> are_na() |> sum()
plot_title <- glue("Barplot of Counts for Variable: {plot_varname} ({na_count} missing values)")
# .data[[plot_varname]] resolves the string column name inside aes().
explore_plot <- ggplot(explore_data_tbl) +
geom_bar(aes(x = .data[[plot_varname]])) +
xlab(plot_varname) +
ylab("Count") +
scale_y_continuous(labels = label_comma()) +
ggtitle(plot_title) +
theme(axis.text.x = element_text(angle = 30, vjust = 0.5))
plot(explore_plot)
}--
isjointlife
--
islifeonly
--
lapsed
Numeric variables are usually continuous in nature, though we also have integer data.
# For each continuous variable: print a summary(), then show a 2x2 grid of
# (i) a histogram of all values with mean/median reference lines, and
# (ii)/(iii) log10-scale histograms of the positive and negative values.
numeric_vars <- coltype_lst$split$continuous |> sort()
for (plot_varname in numeric_vars) {
cat("--\n")
cat(glue("{plot_varname}\n"))
plot_var <- explore_data_tbl |> pull(.data[[plot_varname]])
na_count <- plot_var |> are_na() |> sum()
plot_var |> summary() |> print()
plot_title <- glue("Histogram Plot for Variable: {plot_varname} ({na_count} missing values)")
all_plot <- ggplot() +
geom_histogram(aes(x = plot_var), bins = dataexp_hist_bins_count) +
# NOTE(review): `size` for lines is deprecated in ggplot2 >= 3.4 in
# favour of `linewidth` — confirm the ggplot2 version in use.
geom_vline(xintercept = mean(plot_var, na.rm = TRUE),
colour = "red", size = 1.5) +
geom_vline(xintercept = median(plot_var, na.rm = TRUE),
colour = "green", size = 1.5) +
xlab(plot_varname) +
ylab("Count") +
scale_x_continuous(labels = label_comma()) +
scale_y_continuous(labels = label_comma()) +
ggtitle(
plot_title,
subtitle = "(red line is mean, green line is median)"
)
# Positive values on a log10 x-axis. abs() is a no-op after the >= 0
# filter; it is kept for symmetry with the negative branch below.
pos_data_tbl <- explore_data_tbl |>
filter(.data[[plot_varname]] >= 0) |>
mutate(var_val = abs(.data[[plot_varname]]))
pos_log_plot <- ggplot(pos_data_tbl) +
geom_histogram(aes(x = var_val), bins = dataexp_hist_bins_count) +
xlab(plot_varname) +
ylab("Count") +
scale_x_log10(labels = label_comma()) +
scale_y_continuous(labels = label_comma()) +
ggtitle("Positive Values")
# Negative values, made positive so they can share the log10 scale.
neg_data_tbl <- explore_data_tbl |>
filter(.data[[plot_varname]] < 0) |>
mutate(var_val = abs(.data[[plot_varname]]))
neg_log_plot <- ggplot(neg_data_tbl) +
geom_histogram(aes(x = var_val), bins = dataexp_hist_bins_count) +
xlab(plot_varname) +
ylab("Count") +
scale_x_log10(labels = label_comma()) +
scale_y_continuous(labels = label_comma()) +
ggtitle("Negative Values")
# NULL fills the empty top-right cell of the 2x2 cowplot grid.
plot_grid(
all_plot,
NULL,
pos_log_plot,
neg_log_plot,
nrow = 2
) |>
print()
}--
mort_rating Min. 1st Qu. Median Mean 3rd Qu. Max.
100.0 100.0 150.0 141.5 200.0 300.0
--
policy_duration Min. 1st Qu. Median Mean 3rd Qu. Max.
5.00 10.00 20.00 17.75 20.00 35.00
--
policy_lifetime Min. 1st Qu. Median Mean 3rd Qu. Max.
0.1429 91.2857 260.8571 308.1308 484.4286 1043.5714
--
prem_ape Min. 1st Qu. Median Mean 3rd Qu. Max.
3.637e+01 8.504e+02 1.886e+03 4.907e+03 4.428e+03 1.365e+06
--
prem_risk Min. 1st Qu. Median Mean 3rd Qu. Max.
25.98 607.42 1347.48 3504.96 3163.17 975008.71
--
sum_assured Min. 1st Qu. Median Mean 3rd Qu. Max.
100000 200000 300000 435722 450000 5000000
--
weeks_to_now Min. 1st Qu. Median Mean 3rd Qu. Max.
0.1429 464.2857 664.2857 657.5921 875.5714 1356.4286
--
weeks_to_status Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0 0.0 113.0 197.8 304.4 1043.6
Categorical variables only have values from a limited, and usually fixed, number of possible values
# For each discrete variable, show two bar plots: one in natural level
# order, and one in descending frequency order with rare levels lumped
# into "Other" (at most dataexp_cat_level_count levels kept).
categorical_vars <- coltype_lst$split$discrete |> sort()
for (plot_varname in categorical_vars) {
cat("--\n")
cat(glue("{plot_varname}\n"))
na_count <- explore_data_tbl |> pull(.data[[plot_varname]]) |> are_na() |> sum()
plot_title <- glue("Barplot of Counts for Variable: {plot_varname} ({na_count} missing values)")
# Pre-counted levels plotted with weight = n (equivalent to geom_col).
standard_plot_tbl <- explore_data_tbl |>
count(.data[[plot_varname]])
standard_plot <- ggplot(standard_plot_tbl) +
geom_bar(aes(x = .data[[plot_varname]], weight = n)) +
xlab(plot_varname) +
ylab("Count") +
scale_x_discrete(labels = ~ abbreviate(.x, minlength = 10)) +
scale_y_continuous(labels = label_comma()) +
ggtitle(plot_title) +
theme(axis.text.x = element_text(angle = 30, vjust = 0.5))
standard_plot |> print()
# Frequency-ordered version: lump rare levels, truncate long labels.
desc_plot_tbl <- explore_data_tbl |>
pull(.data[[plot_varname]]) |>
fct_lump(n = dataexp_cat_level_count) |>
fct_count() |>
mutate(f = fct_relabel(f, str_trunc, width = 15))
desc_plot <- ggplot(desc_plot_tbl) +
geom_bar(aes(x = fct_reorder(f, -n), weight = n)) +
xlab(plot_varname) +
ylab("Count") +
scale_x_discrete(labels = ~ abbreviate(.x, minlength = 10)) +
scale_y_continuous(labels = label_comma()) +
ggtitle(plot_title) +
theme(axis.text.x = element_text(angle = 30, vjust = 0.5))
desc_plot |> print()
}--
cluster_id
--
countyname
--
gender_life1
--
mortgage_status
--
nuts3name
--
policy_status
--
prem_freq
--
prem_type
--
prod_type
--
smoker_life1
Date/Time variables represent calendar or time-based data such as the time of day, a date, or a timestamp.
# For each date/time variable: print a summary() and show a histogram of
# its values, with the missing-value count in the title.
datetime_vars <- coltype_lst$split$datetime |> sort()
for (plot_varname in datetime_vars) {
cat("--\n")
cat(glue("{plot_varname}\n"))
plot_var <- explore_data_tbl |> pull(.data[[plot_varname]])
na_count <- plot_var |> are_na() |> sum()
plot_var |> summary() |> print()
plot_title <- glue("Barplot of Dates/Times in Variable: {plot_varname} ({na_count} missing values)")
explore_plot <- ggplot(explore_data_tbl) +
geom_histogram(aes(x = .data[[plot_varname]]), bins = dataexp_hist_bins_count) +
xlab(plot_varname) +
ylab("Count") +
scale_y_continuous(labels = label_comma()) +
ggtitle(plot_title)
plot(explore_plot)
}--
dob_life1 Min. 1st Qu. Median Mean 3rd Qu. Max.
"1919-06-26" "1955-02-18" "1962-11-23" "1962-11-28" "1970-09-14" "1999-11-09"
--
policy_enddate Min. 1st Qu. Median Mean 3rd Qu. Max.
"1995-01-02" "2014-05-10" "2019-10-21" "2021-02-24" "2027-01-08" "2050-12-31"
--
policy_startdate Min. 1st Qu. Median Mean 3rd Qu. Max.
"1990-01-02" "1999-03-22" "2003-04-09" "2003-05-25" "2007-02-07" "2015-12-31"
--
policy_statuschangedate Min. 1st Qu. Median Mean 3rd Qu. Max.
"1990-02-02" "2003-05-24" "2007-06-17" "2007-03-10" "2011-06-13" "2015-12-31"
We now move on to looking at bivariate plots of the data set.
A natural way to explore relationships in data is to create univariate visualisations facetted by a categorical value.
### _TEMPLATE_
### facet_varname <- ''
# Categorical variable used to facet all the bivariate plots below.
facet_varname <- "cluster_id"
# NOTE(review): dataexp_facet_count_max appears unused in the visible
# code — it may be consumed elsewhere in the template; confirm.
dataexp_facet_count_max <- 3For logical variables we facet on barplots of the levels, comparing TRUE, FALSE and missing data.
# Faceted bar plots of each logical variable, split by facet_varname.
# The facet variable itself is excluded from the loop.
logical_vars <- logical_vars[!logical_vars %in% facet_varname] |> sort()
for (plot_varname in logical_vars) {
cat("--\n")
cat(plot_varname)
# Drop rows where this variable is missing before plotting.
plot_tbl <- data_tbl |> filter(!are_na(.data[[plot_varname]]))
explore_plot <- ggplot(plot_tbl) +
geom_bar(aes(x = .data[[plot_varname]])) +
facet_wrap(facet_varname, scales = "free") +
xlab(plot_varname) +
ylab("Count") +
scale_y_continuous(labels = label_comma()) +
ggtitle(glue("{facet_varname}-Faceted Histogram for Variable: {plot_varname}")) +
theme(axis.text.x = element_text(angle = 30, vjust = 0.5))
plot(explore_plot)
}--
isjointlife
--
islifeonly
--
lapsed
For numeric variables, we facet on histograms of the data.
# Faceted histograms of each numeric variable, split by facet_varname.
for (plot_varname in numeric_vars) {
cat("--\n")
cat(plot_varname)
plot_tbl <- data_tbl |> filter(!are_na(.data[[plot_varname]]))
explore_plot <- ggplot(plot_tbl) +
geom_histogram(aes(x = .data[[plot_varname]]), bins = dataexp_hist_bins_count) +
facet_wrap(facet_varname, scales = "free") +
xlab(plot_varname) +
ylab("Count") +
scale_x_continuous(labels = label_comma()) +
scale_y_continuous(labels = label_comma()) +
ggtitle(glue("{facet_varname}-Faceted Histogram for Variable: {plot_varname}")) +
theme(axis.text.x = element_text(angle = 30, vjust = 0.5))
print(explore_plot)
}--
mort_rating
--
policy_duration
--
policy_lifetime
--
prem_ape
--
prem_risk
--
sum_assured
--
weeks_to_now
--
weeks_to_status
We treat categorical variables like logical variables, faceting the barplots of the different levels of the data.
# Faceted bar plots of each categorical variable, split by facet_varname.
# The facet variable itself is excluded; level labels are truncated and
# abbreviated to keep the axes readable.
categorical_vars <- categorical_vars[!categorical_vars %in% facet_varname] |> sort()
for (plot_varname in categorical_vars) {
cat("--\n")
cat(plot_varname)
plot_tbl <- data_tbl |>
filter(!are_na(.data[[plot_varname]])) |>
mutate(
varname_trunc = fct_relabel(.data[[plot_varname]], str_trunc, width = 10)
)
explore_plot <- ggplot(plot_tbl) +
geom_bar(aes(x = varname_trunc)) +
facet_wrap(facet_varname, scales = "free") +
xlab(plot_varname) +
ylab("Count") +
scale_x_discrete(labels = ~ abbreviate(.x, minlength = 10)) +
scale_y_continuous(labels = label_comma()) +
ggtitle(glue("{facet_varname}-Faceted Histogram for Variable: {plot_varname}")) +
theme(axis.text.x = element_text(angle = 30, vjust = 0.5))
plot(explore_plot)
}--
countyname
--
gender_life1
--
mortgage_status
--
nuts3name
--
policy_status
--
prem_freq
--
prem_type
--
prod_type
--
smoker_life1
Like the univariate plots, we facet on histograms of the years in the dates.
# Faceted histograms of each date/time variable, split by facet_varname.
for (plot_varname in datetime_vars) {
cat("--\n")
cat(plot_varname)
plot_tbl <- data_tbl |> filter(!are_na(.data[[plot_varname]]))
explore_plot <- ggplot(plot_tbl) +
geom_histogram(aes(x = .data[[plot_varname]]), bins = dataexp_hist_bins_count) +
facet_wrap(facet_varname, scales = "free") +
xlab(plot_varname) +
ylab("Count") +
scale_y_continuous(labels = label_comma()) +
ggtitle(glue("{facet_varname}-Faceted Histogram for Variable: {plot_varname}")) +
theme(axis.text.x = element_text(angle = 30, vjust = 0.5))
plot(explore_plot)
}--
dob_life1
--
policy_enddate
--
policy_startdate
--
policy_statuschangedate